Libraries

# data wrangling
library(plyr)
library(dplyr)
library(tidyr)
library(reshape)
library(stringr)

# maps and geography
library(rgeos)
library(sf) # World map
library(rnaturalearth)

# visualisation
library(ggplot2)
library(pheatmap)
library(RColorBrewer)
library(ggrepel)
library(cowplot)

# differential abundance analysis
library(DESeq2)

Data

Metadata

The metadata contains information about the samples, such as the sampling date, the Longhurst region, the biome, GPS locations, and measurements like the temperature, salinity, oxygen, etc.

meta <- read.csv("/Users/jegoussc/Repositories/hanoxy/data/info/metadata.csv", 
  sep = ',')

meta$sampling_date = as.Date(meta$sampling_date)
# head(meta)

Function to get the seasons.

date2season <- function(DATES) {
    WS <- as.Date("2012-12-15", format = "%Y-%m-%d") # Winter Solstice
    SE <- as.Date("2012-3-15",  format = "%Y-%m-%d") # Spring Equinox
    SS <- as.Date("2012-6-15",  format = "%Y-%m-%d") # Summer Solstice
    FE <- as.Date("2012-9-15",  format = "%Y-%m-%d") # Autumn Equinox

    # Convert dates from any year to 2012 dates
    d <- as.Date(strftime(DATES, format="2012-%m-%d"))

    ifelse (d >= WS | d < SE, "Winter",
      ifelse (d >= SE & d < SS, "Spring",
        ifelse (d >= SS & d < FE, "Summer", "Autumn")))
}

Add the season parameter to metadata.

meta$season = date2season(meta$sampling_date)

The global ocean and TARA ocean

First, we retrieve the world map data which relies on the rnaturalearth package by Andy South (2017).

world_map <- rnaturalearth::ne_countries(scale = 'small', returnclass = c("sf"))

Draw the base map.

map = ggplot() +
  geom_sf(data = world_map, size = .2, fill = 'white', col = 'white') +
  theme(panel.grid.major = element_line(color = "grey", linetype = "dashed", size = 0.25)) +
  coord_sf(expand = FALSE) + 
  labs(x = 'Longitude', y = 'Latitude') 
map

Longhurst provinces

Longhurst provinces represent a partition of the world oceans into provinces as defined by Longhurst (1995; 1998; 2006) based on the prevailing role of physical forcing as a regulator of phytoplankton distribution. Initially, these static boundaries were developed at the Bedford Institute of Oceanography, Canada.

Note that the boundaries of these provinces are not fixed in time and space, but are dynamic and move under seasonal and inter-annual changes in physical forcing.

Data set was downloaded from https://www.marineregions.org/downloads.php.

# import longhurst provinces
longhurst <- sf::read_sf("/Users/jegoussc/Repositories/haliea/INFOS/longhurst-world-v4-2010.shx")

Draw the blank world map with the Longhurst provinces.

longhurst_map = map + 
  geom_sf(data = longhurst, aes(fill = ProvCode), 
    size = .1, col = "white", alpha = .25) +
  geom_sf_text(data = longhurst %>% 
      group_by(ProvCode), aes(label = ProvCode),
    colour = "black", size = 3, check_overlap = TRUE) +
  scale_fill_manual(values = rev(colorRampPalette(brewer.pal(9, "YlGnBu"))(54))) +
  coord_sf(expand = FALSE) + 
  theme(legend.position = "none") + 
  ggtitle(paste("Longhurst Biogeochemical Provinces -",
    length(unique(longhurst$ProvCode)),"provinces"))

longhurst_map

Biomes

At the first level of reduction, Longhurst recognized four principal biomes (also referred to as domains in earlier publications): the Polar biome, the Westerlies biome, the Trade-Winds biome, and the Coastal Boundary Zone biome. These four biomes are recognizable in every major ocean basin. At the next level of reduction, the ocean basins are partitioned into provinces, roughly ten for each basin. These partitions provide a template for data analysis or for making parameter assignments on a global scale.

Add a column for a short biome or domain name.

longhurst$biome = str_split(longhurst$ProvDescr, pattern = " - ", simplify = TRUE)[,1]

# set order 
longhurst$biome =  factor(longhurst$biome , levels = c("Polar", "Westerlies", "Trades", "Coastal"))

Draw the world map with the biomes

biomes_map = map + 
  geom_sf(data = longhurst, aes(fill = biome), size = .1, col = "white", alpha = .5) +
  theme(legend.position = "bottom", legend.direction = "horizontal") + 
  scale_fill_brewer(palette = "RdBu", direction = -1) +
  coord_sf(expand = FALSE) + 
  labs(x = 'Longitude', y = 'Latitude', fill = "Biome") +
  ggtitle("Biomes associated with Longhurst Biogeochemical Provinces")

biomes_map

legend_biome <- get_legend(biomes_map)
plot_grid(longhurst_map, biomes_map + theme(legend.position = "none"), NULL, legend_biome,
  labels = c("A", "B", "", ""), label_size = 12, 
  ncol = 1, align = 'vh', axis = 'l', 
  rel_heights = c(1,1,-.2,0.5), 
  rel_widths = c(1,1,1,0.5))

ggsave(
  "fig-s1.pdf",
  plot = last_plot(),
  device = "pdf",
  scale = 1,
  width = 6,
  height = 7.5,
  dpi = 300
)
biomes_map +
  geom_point(aes(
    x = meta$longitude, 
    y = meta$latitude), shape = 3) + 
  geom_text_repel(aes(
    x = meta$longitude, 
    y = meta$latitude, 
  label=meta$station_name),
    size = 2
  )

meta %>%
  group_by(biome) %>%
  select(biome, temperature) %>%
  summarise_at(vars(temperature), list(temp = mean))
## # A tibble: 4 × 2
##   biome       temp
##   <chr>      <dbl>
## 1 Coastal    21.8 
## 2 Polar       2.68
## 3 Trade wind 25.1 
## 4 Westerly   19.4
meta %>%
  group_by(biome) %>%
  select(biome, salinity) %>%
  summarise_at(vars(salinity), list(sali = mean))
## # A tibble: 4 × 2
##   biome       sali
##   <chr>      <dbl>
## 1 Coastal     35.4
## 2 Polar       33.0
## 3 Trade wind  35.3
## 4 Westerly    36.7
meta %>%
  group_by(biome) %>%
  select(biome, nitrate) %>%
  summarise_at(vars(nitrate), list(nit = mean), na.rm = TRUE)
## # A tibble: 4 × 2
##   biome        nit
##   <chr>      <dbl>
## 1 Coastal    2.48 
## 2 Polar      5.82 
## 3 Trade wind 0.838
## 4 Westerly   0.861
meta %>%
  group_by(biome) %>%
  select(biome, chlorophyll) %>%
  summarise_at(vars(chlorophyll), list(chloro_mean = mean, chloro_sd = sd), na.rm = TRUE)
## # A tibble: 4 × 3
##   biome      chloro_mean chloro_sd
##   <chr>            <dbl>     <dbl>
## 1 Coastal          0.277     0.415
## 2 Polar            1.32      2.14 
## 3 Trade wind       0.137     0.119
## 4 Westerly         0.193     0.212

Genomes

gtdb_info = read.delim("/Users/jegoussc/Repositories/hanoxy/data/info/haliea-gtdb.tsv")
genome_info = gtdb_info %>% separate(data = ., col = ncbi_taxo, 
  into = c('ncbi_domain', 'ncbi_phylum', 'ncbi_class', 'ncbi_order', 'ncbi_family', 'ncbi_genus', 'ncbi_species'), 
  sep = ";") %>%
  separate(data = ., col = gtdb_taxo, into = c('domain', 'gtdb_phylum', 'gtdb_class', 'gtdb_order', 'gtdb_family', 'gtdb_genus', 'gtdb_species'), sep = ";") %>%
  mutate(gtdb_species = substr(gtdb_species,5,nchar(gtdb_species)),
    gtdb_genus = substr(gtdb_genus,5,nchar(gtdb_genus))) %>%
  dplyr::select(id, ncbi_organism_name, gtdb_genus, gtdb_species)

Count data

Count data computed by CoverM, which uses the mapping algorythm minimap2. The count data is the number of reads aligned toq each genome. Note that a single read may be aligned to multiple genomes with supplementary alignments.

Locate the files

count_dir <- "/Users/jegoussc/Repositories/hanoxy/results/counts"
count_files <- list.files(path = count_dir, pattern = 'tsv',
  full.names = TRUE)

Read count data files

counts <- NA
for (f in count_files) {
 # print(f)
  r <- read.delim(file = f, header = TRUE,
    row.names = 1)
  counts = cbind(counts, r)
}

Rename columns

colnames(counts) <- substr(colnames(counts), 1, 8)
rownames(counts) <- substr(rownames(counts), 1, 15)

Inspect the data frame

Cleanup the data frame

counts = counts %>% 
  select(-counts) # remove the empty counts column

Melt dataset

mcounts = counts %>%
  t() %>% # transpose
  melt() # melt
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE

## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
# rename the columns
colnames(mcounts) = c("station_name", "genome", "count")

# inspect the resulting dataframe
mcounts %>%
  head()
##   station_name          genome count
## 1     MIME_001 GCA_000423125.1  1567
## 2     MIME_002 GCA_000423125.1   592
## 3     TARA_030 GCA_000423125.1 15917
## 4     TARA_031 GCA_000423125.1 10027
## 5     TARA_032 GCA_000423125.1  7349
## 6     TARA_033 GCA_000423125.1  5758

Explore distribution

hist(mcounts$count, breaks = 100,
  main = "Histogram of counts", xlab = "Counts")

mcts_genus = counts %>% 
  mutate(id = rownames(counts)) %>%
  left_join(., genome_info, by = 'id') %>%
  group_by(gtdb_genus) %>%
  summarise(across(starts_with(c("TARA", "MIME")), mean)) %>%
  select(gtdb_genus, starts_with(c("TARA", "MIME"))) %>%
  as.data.frame() %>%
  filter(!is.na(gtdb_genus)) %>%
  melt()
## Using gtdb_genus as id variables
colnames(mcts_genus) = c("gtdb_genus", "station_name", "count")
mcts_genus$count = round(mcts_genus$count, digits = 0)

mcts_genus = left_join(mcts_genus, meta, by = "station_name")

Analysis

Relative proportions

Scale count data to library size to know the proportions of metagenomic reads recruited by the Halieaceae genomes in percentages.

perc = counts
for (s in colnames(counts)) {
  perc[,s] = counts[,s]/meta[meta$station_name == s,]$filtered_reads * 100 
}

Per genus

mperc_genus = perc %>%
  round(., digits = 3) %>%
  mutate(id = rownames(.)) %>%
  left_join(., genome_info, by = 'id') %>%
  group_by(gtdb_genus) %>%
   summarise(across(where(is.numeric), sum)) %>%
  as.data.frame() %>%
  melt()
## Using gtdb_genus as id variables
colnames(mperc_genus) = c("genus", "station_name", "abundance")

Mean abundance per genus

mperc_genus %>% select(genus, abundance) %>% group_by(genus) %>%
    summarise(across(where(is.numeric), c(mean = mean, sd = sd))) %>%
  arrange(abundance_mean) 
## # A tibble: 13 × 3
##    genus          abundance_mean abundance_sd
##    <chr>                   <dbl>        <dbl>
##  1 Halioglobus_A         0.0024       0.00191
##  2 Chromatocurvus        0.00552      0.00142
##  3 NZNC01                0.00688      0.0147 
##  4 Pseudohaliea          0.00814      0.00313
##  5 GCA-2748505           0.00925      0.0638 
##  6 Aequoribacter         0.00962      0.00449
##  7 CABYJX01              0.0101       0.00388
##  8 Kineobactrum          0.0135       0.0244 
##  9 Haliea                0.0196       0.0106 
## 10 Congregibacter        0.0217       0.00819
## 11 Parahaliea            0.0296       0.0129 
## 12 Halioglobus           0.139        0.0883 
## 13 Luminiphilus          0.422        0.345

Per species

mperc_species = perc %>%
  round(., digits = 3) %>%
  mutate(id = rownames(.)) %>%
  left_join(., genome_info, by = 'id') %>%
  group_by(gtdb_species) %>%
   summarise(across(where(is.numeric), sum)) %>%
  as.data.frame() %>%
  melt()
## Using gtdb_species as id variables
colnames(mperc_species) = c("species", "station_name", "abundance")
mperc_species %>% select(species, abundance) %>% 
  group_by(species) %>%
    summarise(across(where(is.numeric), mean)) %>% 
  arrange(abundance)
## # A tibble: 69 × 2
##    species                    abundance
##    <chr>                          <dbl>
##  1 Congregibacter sp002428935   0.00131
##  2 Halioglobus sp009937575      0.00157
##  3 Halioglobus sp013214285      0.00169
##  4 Luminiphilus sp002390485     0.00171
##  5 Luminiphilus sp008081075     0.0018 
##  6 Luminiphilus sp002389505     0.00205
##  7 Haliea sp002414665           0.0022 
##  8 Halioglobus_A pacificus      0.0024 
##  9 Luminiphilus sp013911345     0.00243
## 10 Luminiphilus sp009886815     0.00263
## # … with 59 more rows

Merge the percentages of metagenomic reads recruited by the Halieaceae genomes.

mperc_genus = merge(mperc_genus, meta)
mperc_species = merge(mperc_species, meta)

Percentage per biome

mperc_species %>%
  group_by(biome) %>%
  summarise_at(., vars(abundance), c(mean, min, max, sd)) %>%
  select(fn1) %>% mutate(fn1 = fn1 *100) %>% round(., digits = 1)
## # A tibble: 4 × 1
##     fn1
##   <dbl>
## 1   1.4
## 2   0.7
## 3   1  
## 4   1
ggplot(mperc_genus, aes(x=abundance, fill = genus, colour = genus)) + 
  geom_density(alpha = .5) + 
  facet_grid(genus~., margins = TRUE, scales = "free") + 
  theme_minimal() + 
  scale_fill_manual(values = colorRampPalette(brewer.pal(8, "Spectral"))(14)) +
  scale_colour_manual(values = colorRampPalette(brewer.pal(8, "Spectral"))(14))

ggsave(
  "fig-s2.pdf",
  plot = last_plot(),
  device = "pdf",
  scale = 1,
  width = 8,
  height = 9,
  dpi = 300
)
dat = mperc_species %>%
  select(station_name, abundance) %>%
  group_by(station_name) %>%
   summarise(across(where(is.numeric), sum)) %>%
  as.data.frame() %>%
  left_join(., meta)
## Joining, by = "station_name"
map +
  geom_point(aes(
    x = dat$longitude, 
    y = dat$latitude,
    size = dat$abundance,
    colour = dat$biome),
    alpha = .75) +
  scale_colour_manual(values = c(brewer.pal(4, "RdBu")[1], brewer.pal(4, "RdBu")[4], brewer.pal(4, "RdBu")[2], brewer.pal(4, "RdBu")[3])) +
  labs(#title = "Global distribution of the Halieaceae family",
    colour = "Biome", size = "Abundance (%)",
    #caption = "Percentages of metagenomics reads mapped to 69 de-replicated Halieaceae genomes\nat sampling stations representing the four marine biomes."
    )

ggsave(
  "fig-2.pdf",
  plot = last_plot(),
  device = "pdf",
  scale = 1,
  width = 8,
  height = 4,
  dpi = 300
)

Genus level

Normalisation with DESeq

This is an alternative to the scaling. Indeed, DESeq needs un-normalised count data as input.

http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#the-deseqdataset

Merge count data with genome info

cts_genus = counts %>% 
  mutate(id = rownames(counts)) %>%
  left_join(., genome_info, by = 'id') %>%
  group_by(gtdb_genus) %>%
  summarise(across(starts_with(c("TARA", "MIME")), mean)) %>%
  select(gtdb_genus, starts_with(c("TARA", "MIME"))) %>%
  as.data.frame() %>%
  filter(!is.na(gtdb_genus))

rownames(cts_genus) = cts_genus$gtdb_genus
cts_genus = cts_genus %>% select(starts_with(c("TARA", "MIME")))
cts_genus = round(cts_genus, digits = 0)
coldata = meta %>% filter(station_name %in% colnames(cts_genus))
rownames(coldata) = coldata$station_name

coldata = coldata %>% 
  mutate(condition = ifelse(biome == "Polar", "Polar", "Non Polar"))

all(rownames(coldata) == colnames(cts_genus))
## [1] TRUE
dds <- DESeqDataSetFromMatrix(countData = cts_genus,
                              colData = coldata,
                              design = ~ biome) # biome or condition?
dds
## class: DESeqDataSet 
## dim: 13 65 
## metadata(1): version
## assays(1): counts
## rownames(13): Aequoribacter CABYJX01 ... Parahaliea Pseudohaliea
## rowData names(0):
## colnames(65): TARA_030 TARA_031 ... MIME_001 MIME_002
## colData names(20): station_name sampling_date ... season condition
featureData <- data.frame(genome=rownames(cts_genus))
mcols(dds) <- DataFrame(mcols(dds), featureData)
mcols(dds)
## DataFrame with 13 rows and 1 column
##                        genome
##                   <character>
## Aequoribacter   Aequoribacter
## CABYJX01             CABYJX01
## Chromatocurvus Chromatocurvus
## Congregibacter Congregibacter
## GCA-2748505       GCA-2748505
## ...                       ...
## Kineobactrum     Kineobactrum
## Luminiphilus     Luminiphilus
## NZNC01                 NZNC01
## Parahaliea         Parahaliea
## Pseudohaliea     Pseudohaliea

Differential abundance analysis

Apply methodology by Anders et al. (2010) https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-10-r106#Sec2

# calculate geometric means prior to estimate size factors
gm_mean = function(x, na.rm=TRUE){
  exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}

geoMeans = apply(counts(dds), 1, gm_mean)
dds = estimateSizeFactors(dds, geoMeans = geoMeans)

Differential abundances analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution with Wald test

dds <- DESeq(dds)
wtres <- results(dds)

Likelihood ratio test (chi-squared test) for GLMs to test for significance of change in deviance between a full (~biome) and reduced model.

dds <- estimateDispersionsGeneEst(dds)
## found already estimated gene-wise dispersions, removing these
dispersions(dds) <- mcols(dds)$dispGeneEst
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
dds <- nbinomLRT(dds, reduced = ~ 1)
## found results columns, replacing these
lrtres <- results(dds)

Select the species that differentially present between the conditions.

differential_genera = res %>% as.data.frame() %>%
  filter(padj < 0.05) %>%
  rownames()
differential_genera
## [1] "Aequoribacter"  "CABYJX01"       "Chromatocurvus" "Congregibacter"
## [5] "Halioglobus"    "Luminiphilus"   "Pseudohaliea"

Extract the normalised counts

ncts_genus = round(assay(normTransform(dds)), digits = 0)

Melt the normalised counts

mncts_genus = ncts_genus %>%
  t() %>% # transpose
  melt() # melt

# rename the columns
colnames(mncts_genus) = c("station_name", "genus", "count")

Possibility to inspect the resulting dataframe

Abundance heatmap

Prepare the matrix to plot the heatmap; here we subselect the differential species.

ordered_station_names = meta %>%
  filter(station_name %in% colnames(ncts_genus)) %>%
  arrange(biome, ocean, station_name) %>%
  select(station_name)

#matrix = ncts[rownames(ncts) %in% differential_species,] %>% unlist() %>%
#  as.data.frame() %>%
#  select(ordered_station_names$station_name)

matrix = ncts_genus %>% unlist() %>%
  as.data.frame() %>%
  select(ordered_station_names$station_name)

Set categorical variables and graphic parameters.

# significance levels
anno_sign = res %>% as.data.frame() %>%
  select(padj) %>%
  mutate(significance = gtools::stars.pval(padj)) %>%
  select(significance) %>%
  mutate(significance = replace(significance, significance %in% c(NA, ".", "", " "), "NS"))

anno <- as.data.frame(colData(dds)[,c("biome", "ocean", "season")])

# colour palette for the biomes
biome_colours <- brewer.pal(4, "RdBu")
names(biome_colours) <- unique(anno$biome)

# colour palette for the oceans
ocean_colours <- brewer.pal(6, "YlGnBu")[2:6]
names(ocean_colours) <- unique(anno$ocean)

season_colours = brewer.pal(4, "Pastel2")
names(season_colours) <- unique(anno$season)

significance_colours = c(brewer.pal(4, "Reds"))
names(significance_colours) <- c("NS", "*", "**", "***")

annoCol <- list(ocean = ocean_colours, 
  biome = biome_colours,
  season = season_colours,
  significance =significance_colours)

Check optimal number of clusters

library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(NbClust)
df = scale(matrix)
# Elbow method
fviz_nbclust(df, kmeans, method = "wss") + #c("silhouette", "wss", "gap_stat"),
    geom_vline(xintercept = 4, linetype = 2)+
  labs(subtitle = "Elbow method")

Draw the heatmap

pheatmap(matrix, 
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "Spectral")))(100),
  cluster_rows = TRUE, 
  show_rownames = TRUE,
  show_colnames = TRUE,
  cluster_cols = FALSE,
  fontsize_row = 7,
  fontsize_col = 5,
  annotation_col = anno,
  annotation_row = anno_sign,
  border_color = "white", 
  clustering_method = "complete", # "ward.D"
  annotation_colors = annoCol,
  cutree_rows = 4,
  treeheight_row = 20,
  gaps_col = c(12, 29, 54),
  #main = "Global distribution of Halieaceae genera."
  ) %>% ggplotify::as.ggplot()

ggsave(
  "fig-s3.pdf",
  plot = last_plot(),
  device = "pdf",
  scale = 1,
  width = 8,
  height = 5,
  dpi = 300
)

Species level

Normalisation with DESeq

This is an alternative to the scaling. Indeed, DESeq needs un-normalised count data as input.

http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#the-deseqdataset

Merge count data with genome info

cts_species = counts %>% 
  mutate(id = rownames(counts)) %>%
  left_join(., genome_info, by = 'id') %>%
  group_by(gtdb_species) %>%
  summarise(across(starts_with(c("TARA","MIME")), mean)) %>%
  select(gtdb_species, starts_with(c("TARA","MIME"))) %>%
  as.data.frame() %>%
  filter(!is.na(gtdb_species))

rownames(cts_species) = cts_species$gtdb_species
cts_species = cts_species %>% select(starts_with(c("TARA","MIME")))
cts_species = round(cts_species, digits = 0)
coldata = meta %>% filter(station_name %in% colnames(cts_species))
rownames(coldata) = coldata$station_name

coldata = coldata %>% 
  mutate(condition = ifelse(biome == "Polar", "Polar", "Non Polar"))

all(rownames(coldata) == colnames(cts_species))
## [1] TRUE
dds <- DESeqDataSetFromMatrix(countData = cts_species,
                              colData = coldata,
                              design = ~ biome) # biome or condition?
dds
## class: DESeqDataSet 
## dim: 69 65 
## metadata(1): version
## assays(1): counts
## rownames(69): Aequoribacter fuscus Aequoribacter sp003520285 ...
##   Parahaliea sp014077625 Pseudohaliea rubra
## rowData names(0):
## colnames(65): TARA_030 TARA_031 ... MIME_001 MIME_002
## colData names(20): station_name sampling_date ... season condition
featureData <- data.frame(genome=rownames(cts_species))
mcols(dds) <- DataFrame(mcols(dds), featureData)
mcols(dds)
## DataFrame with 69 rows and 1 column
##                                             genome
##                                        <character>
## Aequoribacter fuscus          Aequoribacter fuscus
## Aequoribacter sp003520285   Aequoribacter sp0035..
## CABYJX01 sp902519405          CABYJX01 sp902519405
## Chromatocurvus halotolerans Chromatocurvus halot..
## Congregibacter litoralis    Congregibacter litor..
## ...                                            ...
## Parahaliea aestuarii          Parahaliea aestuarii
## Parahaliea aestuarii_A      Parahaliea aestuarii_A
## Parahaliea mediterranea     Parahaliea mediterra..
## Parahaliea sp014077625      Parahaliea sp014077625
## Pseudohaliea rubra              Pseudohaliea rubra

Extract the normalised counts

ncts_species = round(assay(normTransform(dds)), digits = 0)
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

Melt the normalised counts

mncts_species = ncts_species %>%
  t() %>% # transpose
  melt() # melt

# rename the columns
colnames(mncts_species) = c("station_name", "species", "count")

Mean of species count

summary(t(ncts_species))[4,] %>% sort()
##     GCA-2748505 sp002748505    Luminiphilus sp002390485 
##           "Mean   :10.63  "           "Mean   :10.68  " 
##    Luminiphilus sp008081075  Congregibacter sp002428935 
##           "Mean   :10.75  "           "Mean   :10.92  " 
##     Halioglobus sp009937575    Luminiphilus sp013911345 
##           "Mean   :11.02  "           "Mean   :11.08  " 
##    Luminiphilus sp002389505          Haliea sp002414665 
##           "Mean   :11.17  "            "Mean   :11.2  " 
##    Luminiphilus sp009886815     Halioglobus sp013214285 
##           "Mean   :11.22  "           "Mean   :11.29  " 
##    Luminiphilus sp002691565     Halioglobus sp002862985 
##           "Mean   :11.48  "           "Mean   :11.52  " 
##    Luminiphilus sp002469785    Luminiphilus sp002683335 
##            "Mean   :11.6  "           "Mean   :11.62  " 
##    Luminiphilus sp002683395     Halioglobus_A pacificus 
##           "Mean   :11.62  "           "Mean   :11.66  " 
##    Luminiphilus sp002456975    Luminiphilus sp002689915 
##           "Mean   :11.72  "           "Mean   :11.75  " 
##    Luminiphilus sp003331335    Luminiphilus sp012270045 
##           "Mean   :11.78  "           "Mean   :11.88  " 
##   Aequoribacter sp003520285          Haliea sp002416125 
##           "Mean   :11.97  "           "Mean   :12.15  " 
##    Luminiphilus sp002377985     Halioglobus sp002699145 
##           "Mean   :12.17  "           "Mean   :12.18  " 
##     Halioglobus sp003646405     Halioglobus sp002862465 
##           "Mean   :12.22  "           "Mean   :12.28  " 
##    Luminiphilus sp003331685     Halioglobus sp008370495 
##           "Mean   :12.28  "           "Mean   :12.35  " 
##      Parahaliea aestuarii_A     Halioglobus sp004354085 
##           "Mean   :12.38  "           "Mean   :12.42  " 
##    Luminiphilus sp002703585    Luminiphilus sp009937065 
##           "Mean   :12.45  "           "Mean   :12.55  " 
##        Parahaliea aestuarii      Kineobactrum sediminis 
##           "Mean   :12.66  "           "Mean   :12.82  " 
##      Luminiphilus syltensis    Luminiphilus sp002705465 
##           "Mean   :12.88  "           "Mean   :12.91  " 
##       Halioglobus sediminis          NZNC01 sp002692965 
##           "Mean   :12.92  "           "Mean   :12.92  " 
## Chromatocurvus halotolerans    Luminiphilus sp002721785 
##           "Mean   :12.98  "           "Mean   :13.02  " 
##    Kineobactrum sp010669285    Luminiphilus sp902613175 
##           "Mean   :13.08  "           "Mean   :13.08  " 
##           Haliea alexandrii        Halioglobus maricola 
##           "Mean   :13.11  "           "Mean   :13.14  " 
##     Halioglobus sp004745945     Halioglobus sp003646415 
##           "Mean   :13.15  "           "Mean   :13.17  " 
##    Luminiphilus sp011523115    Luminiphilus sp003523185 
##           "Mean   :13.23  "           "Mean   :13.26  " 
##        Aequoribacter fuscus       Halioglobus lutimaris 
##           "Mean   :13.31  "           "Mean   :13.32  " 
##     Halioglobus sp000156295     Parahaliea mediterranea 
##           "Mean   :13.38  "            "Mean   :13.4  " 
##           Haliea salexigens    Luminiphilus sp902631395 
##           "Mean   :13.43  "           "Mean   :13.43  " 
##    Luminiphilus sp902547755          Pseudohaliea rubra 
##           "Mean   :13.46  "           "Mean   :13.63  " 
##     Halioglobus japonicus_A    Luminiphilus sp002862405 
##           "Mean   :13.66  "           "Mean   :13.69  " 
##    Congregibacter litoralis        CABYJX01 sp902519405 
##           "Mean   :13.85  "           "Mean   :13.91  " 
##  Congregibacter sp000158155    Luminiphilus sp000169115 
##           "Mean   :13.94  "           "Mean   :13.97  " 
##    Luminiphilus sp002696455      Parahaliea sp014077625 
##           "Mean   :14.02  "           "Mean   :14.15  " 
##    Luminiphilus sp000227505    Luminiphilus sp902518075 
##           "Mean   :14.34  "           "Mean   :14.63  " 
##    Luminiphilus sp902520195    Luminiphilus sp902623175 
##           "Mean   :15.08  "           "Mean   :15.22  " 
##       Halioglobus japonicus 
##           "Mean   :15.91  "

Possibility to inspect the resulting data frame

mncts_species %>%
  head()
left_join(mncts_species, meta) %>%
  group_by(biome, species) %>%
  summarise(across(where(is.numeric), c(mean = mean, sd = sd))) %>%
  arrange(count_mean)

left_join(mncts_species, meta) %>%
  arrange(count) %>%
  ggplot(., aes(x = species, y = count)) + geom_boxplot() + facet_wrap(~biome, ncol =1 ) + theme(axis.text.x = element_text(angle = 90))

Differential abundance analysis

Apply methodology by Anders et al. (2010) https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-10-r106#Sec2

# calculate geometric means prior to estimate size factors
gm_mean = function(x, na.rm=TRUE){
  exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}

geoMeans = apply(counts(dds), 1, gm_mean)
dds = estimateSizeFactors(dds, geoMeans = geoMeans)

Differential abundances analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution with Wald test

dds <- DESeq(dds)
wtres <- results(dds)

Likelihood ratio test (chi-squared test) for GLMs to test for significance of change in deviance between a full (~biome) and reduced model.

dds <- estimateDispersionsGeneEst(dds)
## found already estimated gene-wise dispersions, removing these
dispersions(dds) <- mcols(dds)$dispGeneEst
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
dds <- nbinomLRT(dds, reduced = ~ 1)
## found results columns, replacing these
lrtres <- results(dds)

Select the species that differentially present between the conditions.

differential_species = res %>% as.data.frame() %>%
  filter(padj < 0.05) %>%
  rownames()
differential_species
##  [1] "Aequoribacter fuscus"       "Aequoribacter sp003520285" 
##  [3] "CABYJX01 sp902519405"       "Congregibacter litoralis"  
##  [5] "Congregibacter sp000158155" "Congregibacter sp002428935"
##  [7] "Haliea alexandrii"          "Haliea salexigens"         
##  [9] "Halioglobus japonicus"      "Halioglobus japonicus_A"   
## [11] "Halioglobus lutimaris"      "Halioglobus maricola"      
## [13] "Halioglobus sp000156295"    "Halioglobus sp002699145"   
## [15] "Halioglobus sp002862465"    "Halioglobus sp002862985"   
## [17] "Halioglobus sp004354085"    "Halioglobus sp009937575"   
## [19] "Halioglobus sp013214285"    "Halioglobus_A pacificus"   
## [21] "Kineobactrum sp010669285"   "Luminiphilus sp000227505"  
## [23] "Luminiphilus sp002377985"   "Luminiphilus sp002389505"  
## [25] "Luminiphilus sp002390485"   "Luminiphilus sp002683335"  
## [27] "Luminiphilus sp002683395"   "Luminiphilus sp002696455"  
## [29] "Luminiphilus sp002703585"   "Luminiphilus sp002705465"  
## [31] "Luminiphilus sp002721785"   "Luminiphilus sp003331685"  
## [33] "Luminiphilus sp003523185"   "Luminiphilus sp008081075"  
## [35] "Luminiphilus sp009886815"   "Luminiphilus sp011523115"  
## [37] "Luminiphilus sp012270045"   "Luminiphilus sp013911345"  
## [39] "Luminiphilus sp902518075"   "Luminiphilus sp902520195"  
## [41] "Luminiphilus sp902547755"   "Luminiphilus sp902613175"  
## [43] "Luminiphilus sp902623175"   "Luminiphilus sp902631395"  
## [45] "Parahaliea sp014077625"

Abundance heatmap

Prepare the matrix to plot the heatmap; here we subselect the differential species.

ordered_station_names = meta %>%
  filter(station_name %in% colnames(ncts_species)) %>%
  arrange(biome, ocean, station_name) %>%
  select(station_name)

#matrix = ncts[rownames(ncts) %in% differential_species,] %>% unlist() %>%
#  as.data.frame() %>%
#  select(ordered_station_names$station_name)

matrix = ncts_species %>% unlist() %>%
  as.data.frame() %>%
  select(ordered_station_names$station_name)

Check number of clusters

library(factoextra)
library(NbClust)
df = scale(matrix)
# Elbow method
fviz_nbclust(df, kmeans, method = "wss") + #c("silhouette", "wss", "gap_stat"),
    geom_vline(xintercept = 4, linetype = 2)+
  labs(subtitle = "Elbow method")

Set categorical variables and graphic parameters.

# significance levels
anno_sign = res %>% as.data.frame() %>%
  select(padj) %>%
  mutate(significance = gtools::stars.pval(padj)) %>%
  select(significance) %>%
  mutate(significance = replace(significance, significance %in% c(NA, ".", "", " "), "NS"))

anno <- as.data.frame(colData(dds)[,c("biome", "ocean", "season")])

# colour palette for the biomes
biome_colours <- brewer.pal(4, "RdBu")
names(biome_colours) <- unique(anno$biome)

# colour palette for the oceans
ocean_colours <- brewer.pal(6, "YlGnBu")[2:6]
names(ocean_colours) <- unique(anno$ocean)

season_colours = brewer.pal(4, "Pastel2")
names(season_colours) <- unique(anno$season)

significance_colours = c(brewer.pal(4, "Reds"))
names(significance_colours) <- c("NS", "*", "**", "***")

annoCol <- list(ocean = ocean_colours, 
  biome = biome_colours,
  season = season_colours,
  significance =significance_colours)

Draw the heatmap

hm = pheatmap(matrix, 
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "Spectral")))(100),
  cluster_rows = TRUE, 
  show_rownames = TRUE,
  show_colnames = TRUE,
  cluster_cols = FALSE,
  fontsize_row = 7,
  annotation_col = anno,
  annotation_row = anno_sign,
  border_color = "white", 
  clustering_method = "ward.D", #ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
  annotation_colors = annoCol,
  cutree_rows = 4, #gaps_row = c(2,3,4,7,8,12,29,31,63,64,68),
  treeheight_row = 20,
  gaps_col = c(12, 29, 54),
  
  #main = "Abundance heatmap"
  )

pheatmap(matrix, 
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "Spectral")))(100),
  cluster_rows = TRUE, 
  show_rownames = TRUE,
  show_colnames = TRUE,
  cluster_cols = FALSE,
  fontsize_row = 7,
  fontsize_col = 5,
  annotation_col = anno,
  annotation_row = anno_sign,
  border_color = "white", 
  clustering_method = "ward.D", 
  annotation_colors = annoCol,
  cutree_rows = 4,
  treeheight_row = 20,
  gaps_col = c(12, 29, 54),
  #main = "Abundance heatmap"
  ) %>% ggplotify::as.ggplot()

ggsave(
  "fig-3.pdf",
  plot = last_plot(),
  device = "pdf",
  scale = 1,
  width = 8,
  height = 8,
  dpi = 300
)

extract clusters

clust <- cbind(matrix, 
  cluster = cutree(hm$tree_row, 
    k = 4))

ubiquitous = clust %>%
  filter(cluster == 1) %>% rownames()
rare = clust %>%
  filter(cluster == 2) %>% rownames()
coldtolerant = clust %>%
  filter(cluster == 3) %>% rownames()
coldintolerant = clust %>%
  filter(cluster == 4) %>% rownames()

#order_cluster = c(ubiquitous, rare, coldtolerant, coldintolerant)
order_cluster <- rev(rownames(matrix[c(hm$tree_row[["order"]]),]))

Correlations

A negative binomial (NB) generalised linear model (GLM) was fit for each species against each of the variables in the metadata separately, using Deseq2.

A Wald test was then performed on each fit to assess the significance of the independent variable, followed by correction of the p values using the Benjamini−Hochberg (BH) method.

What genera display differential abundances depending on the temperature Fit NB GLM to the relationship between the abundances (counts) and the temperature (abundance as a function of temperature).

The Wald test can also be used with continuous variables such environmental measurements (temperature, salinity, etc.). In this case, the reported log2 fold change is per unit of change of that variable (“log2 fold change per degree C differences.”)

Wald test for the GLM coefficients

This function tests for significance of coefficients in a Negative Binomial GLM, using previously calculated sizeFactors (or normalizationFactors) and dispersion estimates.

The fitting proceeds as follows: standard maximum likelihood estimates for GLM coefficients (synonymous with “beta”, “log2 fold change”, “effect size”) are calculated.

For calculating Wald test p-values, the coefficients are scaled by their standard errors and then compared to a standard Normal distribution. The results function without any arguments will automatically perform a contrast of the last level of the last variable in the design formula over the first level. The contrast argument of the results function can be used to generate other comparisons.

Missing measurement for MIME samples; exclude these samples.

cts_species = cts_species %>% select(starts_with("TARA"))
coldata = filter(coldata, grepl("TARA",station_name))

Temperature

Fit NB GLM and test for the significance of the coefficients using the Wald test.

dds <- DESeqDataSetFromMatrix(countData = cts_species,
                              colData = coldata,
                              design = ~ temperature) 
dds = DESeq(dds)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)
 
restemp = as.data.frame(results(dds)) %>% 
  mutate(variable = "temperature") %>% 
  add_rownames(., var = 'genus') %>%
  select(genus, variable, log2FoldChange, padj) %>%
  mutate(sign = gtools::stars.pval(padj)) %>%
  mutate(genus = factor(genus, levels = order_cluster))

Salinity

Fit NB GLM and test for the significance of the coefficients using the Wald test.

dds <- DESeqDataSetFromMatrix(countData = cts_species,
                              colData = coldata,
                              design = ~ salinity) # biome or condition?
dds = DESeq(dds)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)

ressalinity = as.data.frame(results(dds)) %>% 
  mutate(variable = "salinity") %>% 
  add_rownames(., var = 'genus') %>%
  select(genus, variable, log2FoldChange, padj) %>%
  mutate(sign = gtools::stars.pval(padj)) %>%
  mutate(genus = factor(genus, levels = order_cluster))

Nitrate

Fit NB GLM and test for the significance of the coefficients using the Wald test.

dds <- DESeqDataSetFromMatrix(countData = cts_species,
                              colData = coldata,
                              design = ~ nitrate) 
dds = DESeq(dds)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)

resnitrate = as.data.frame(results(dds)) %>% 
  mutate(variable = "nitrate") %>% 
  add_rownames(., var = 'genus') %>%
  select(genus, variable, log2FoldChange, padj) %>%
  mutate(sign = gtools::stars.pval(padj)) %>%
  mutate(genus = factor(genus, levels = order_cluster))

Oxygen

Fit NB GLM and test for the significance of the coefficients using the Wald test.

dds <- DESeqDataSetFromMatrix(countData = cts_species,
                              colData = coldata,
                              design = ~ oxygen) 
dds = DESeq(dds)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)

resoxygen = as.data.frame(results(dds)) %>% 
  mutate(variable = "oxygen") %>% 
  add_rownames(., var = 'genus') %>%
  select(genus, variable, log2FoldChange, padj) %>%
  mutate(sign = gtools::stars.pval(padj)) %>%
  mutate(genus = factor(genus, levels = order_cluster))

Chlorophyll

Fit NB GLM and test for the significance of the coefficients using the Wald test.

dds <- DESeqDataSetFromMatrix(countData = cts_species,
                              colData = coldata,
                              design = ~ chlorophyll) 
dds = DESeq(dds)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)

reschlorophyll = as.data.frame(results(dds)) %>% 
  mutate(variable = "chlorophyll") %>% 
  add_rownames(., var = 'genus') %>%
  select(genus, variable, log2FoldChange, padj) %>%
  mutate(sign = gtools::stars.pval(padj)) %>%
  mutate(genus = factor(genus, levels = order_cluster))

Heatmap

Set colour scheme for each environmental variable (according to the CPK coloring of chemical elements)

coltemp = scale_fill_distiller(palette = "RdBu")

colsalinity = scale_fill_gradient2(
  midpoint = 0, mid = "white",
  low = brewer.pal(8,"Greys")[6],
  high = brewer.pal(5,"Purples")[5])

colnitrate = scale_fill_gradient2(
  midpoint = 0, mid = "white",
  low = brewer.pal(8,"Greys")[6],
  high = brewer.pal(5,"Blues")[5])

coloxygen = scale_fill_distiller(palette = "RdGy")

colchlorophyll = scale_fill_gradient2(
  midpoint = 0, mid = "white",
  low = brewer.pal(8,"Greys")[6],
  high = brewer.pal(5,"Greens")[5])

Cleanup ggplot theme

cleanup = theme_void() + theme(legend.position = "bottom",
  axis.text.x = element_text(size = 10),
  legend.title = element_blank(),
  legend.direction = "horizontal",
  legend.key.size = unit(8, 'pt'),
  legend.text = element_text(size = 5, angle = 45, vjust = 1),
  plot.margin = margin(l = -.5, r = -.5, unit = "cm"))

Generate all plots

p1 = ggplot(restemp, aes(x = variable, y = genus, fill=log2FoldChange)) + 
  geom_tile(colour = "white", lwd = .5, linetype = 1) +
  geom_text(aes(label=sign), angle=0, vjust=-.2, colour = "black") + 
  coltemp + cleanup

p2 = ggplot(ressalinity, aes(x = variable, y = genus, fill=log2FoldChange)) + 
  geom_tile(colour = "white", lwd = .5, linetype = 1) +
  geom_text(aes(label=sign), angle=0, vjust=-.2, colour = "black") + 
  colsalinity + cleanup

p3 = ggplot(resnitrate, aes(x = variable, y = genus, fill=log2FoldChange)) + 
  geom_tile(colour = "white", lwd = .5, linetype = 1) +
  geom_text(aes(label=sign), angle=0, vjust=-.2, colour = "black") +
  colnitrate + cleanup

p4 = ggplot(resoxygen, aes(x = variable, y = genus, fill=log2FoldChange)) + 
  geom_tile(colour = "white", lwd = .5, linetype = 1) +
  geom_text(aes(label=sign), angle=0, vjust=-.2, colour = "black") + 
  coloxygen + cleanup

p5 = ggplot(reschlorophyll, aes(x = variable, y = genus, fill=log2FoldChange)) + 
  geom_tile(colour = "white", lwd = .5, linetype = 1) +
  geom_text(aes(label=sign), angle=0, vjust=-.2, colour = "black") + 
  colchlorophyll +cleanup
cowplot::plot_grid(NULL, 
  p1 + theme(axis.text.y = element_text(size = 6, hjust = 1)), NULL,
  p2, NULL,  p3, NULL, p4, NULL, p5, 
  ncol = 10, 
  align = 'hv', axis  = 'tb', 
  rel_widths = c(0.4,1,-0.4,1,-0.4,1,-0.4,1,-0.4,1)) %>% 
  ggplotify::as.ggplot() +
  labs(#title = "Envrionmental variables correlated with Halieaceae genera abundances.",
  #  subtitle = "Envrionmental data were obtained from the TARA Ocean data set.\n",
    #caption = "\nSignificant correlations between genera and chemo-physical properties.\n Correlations were assessed using a negative binomial GLM for each Halieaceae genera.\n Gradients represent the log2 fold change and significance level\nrepresent the pvalue from Wald-test adjusted with the Benjamini−Hochberg procedure."
    )

ggsave(
  "fig-4.pdf",
  plot = last_plot(),
  device = "pdf",
  scale = 0.9,
  width = 10,
  height = 10,
  dpi = 300
)

https://swilke-geoscience.net/post/spatial_interpolation/

data = mperc_genus #%>%
  #filter(species == "Halioglobus sp009937575")
  
map +
  geom_point(aes(
    x = data$longitude, 
    y = data$latitude,
    size = data$abundance,
    colour = data$genus))

Chi square

temptab = data.frame(
  S = c(10,14,4,10), # number of species with significant correlation with temp in each cluster
  NS = c(18,13,0,0) # number of species with NS correlation with temp in each cluster
)
chisq.test(temptab, correct=TRUE)
## Warning in chisq.test(temptab, correct = TRUE): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  temptab
## X-squared = 15.775, df = 3, p-value = 0.001261
salinitytab = data.frame(
  S = c(1,3,0,0), 
  NS = c(27,24,4,10) 
)
chisq.test(salinitytab, correct=TRUE)
## Warning in chisq.test(salinitytab, correct = TRUE): Chi-squared approximation
## may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  salinitytab
## X-squared = 2.5117, df = 3, p-value = 0.4732
nitratetab = data.frame(
  S = c(1,3,0,0), 
  NS = c(27,24,4,10) 
)


S = c(); NS = c()
for (i in 1:4) {
  cc = clust %>%
  filter(cluster == i) %>% rownames() 
  S = append(S,
  resnitrate %>% filter(genus %in% cc) %>% 
    filter(padj <= 0.001) %>% 
    summarise(n = n()) %>%  pull())
  NS = append(NS,
  resnitrate %>% filter(genus %in% cc) %>% 
    filter(padj > 0.001) %>% 
    summarise(n = n()) %>%  pull())
}

tabnitrate = data.frame(S, NS)